library(plyr)
library(ggplot2)
library(directlabels)
library(reshape)
library(grid)
library(gridExtra)
library(stargazer)
library(data.table)

# PATHS
projectPath <- "path-to-data-appendix/3_Analysis_of_TAQ_data/"
dataPath <- paste0(projectPath, "WRDS_Server/Output/")


### NOTE about exchange start/end dates
# ISE STOPPED showing up on the system on "2010-07-19"
# EDGA STARTED showing up on the system on "2010-07-19"
# EDGX STARTED showing up on the system on "2010-07-19"
# NSX STOPPED showing up on the system on "2014-06-02" but started trading again on "2015-12-22"
# CBOE STOPPED showing up on the system on "2014-05-01"
# Boston Stock Exchange STOPPED showing up on system on "2007-09-05" but STARTED trading again on "2009-01-16" as Nasdaq OMX BX
# BATS STARTED showing up on the system on "2008-10-28" (only for a few symbols) but most symbols appeared on "2008-11-05" 
# BATS-Y STARTED showing up on the system on "2010-10-15" for select few symbols but finished rolling out all symbols on "2010-10-22"

# NSDQ2 represents "NASDAQ (Tape C)". We have aggregated all the NASDAQ observations into exchange code "T" in the SAS files.

# Function to get exchange abbreviations
getAbbr <- function(x){
  name <- "NA"
  if(x == "A") name <- "AMEX"
  if(x == "B") name <- "BX"
  if(x == "N") name <- "NYSE"
  if(x == "P") name <- "ARCA"
  if(x == "C") name <- "NSX"
  if(x == "T") name <- "NSDQ"
  if(x == "Q") name <- "NSDQ2"
  if(x == "D") name <- "FINRA"
  if(x == "X") name <- "PSX"
  if(x == "M") name <- "CHX"
  if(x == "Z") name <- "BZX"
  if(x == "Y") name <- "BYX"
  if(x == "K") name <- "EDGX"
  if(x == "J") name <- "EDGA"
  if(x == "W") name <- "CBOE"
  if(x == "V") name <- "IEX"
  if(x == "I") name <- "ISE"
  return(name)
}
getAbbr <- Vectorize(getAbbr)

# T is NASDAQ tape A and B. Q is Tape C
getEXname <- function(x){
  name <- "NA"
  if(x == "A") name <- "NYSE MKT LLC"
  if(x == "B") name <- "NASDAQ OMX BX, Inc."
  if(x == "N") name <- "New York Stock Exchange LLC"
  if(x == "P") name <- "NYSE Arca, Inc."
  if(x == "C") name <- "National Stock Exchange, Inc."
  if(x == "T") name <- "The Nasdaq Stock Market LLC (Tape A and B)"
  if(x == "Q") name <- "NASDAQ Tape C"
  if(x == "D") name <- "FINRA"
  if(x == "X") name <- "NASDAQ OMX PSX LLC"
  if(x == "M") name <- "Chicago Stock Exchange, Inc."
  if(x == "Z") name <- "BATS BZX Exchange, Inc."
  if(x == "Y") name <- "BATS BYZ Exchange, Inc."
  if(x == "K") name <- "BATS EDGX Exchange, Inc."
  if(x == "J") name <- "BATS EDGA Exchange, Inc."
  if(x == "W") name <- "CBOE Stock Exchange LLC"
  if(x == "V") name <- "The Investors' Exchange"
  if(x == "I") name <- "International Securities Exchange LLC"
  return(name)
}
getEXname <- Vectorize(getEXname)

top8 <- c("The Nasdaq Stock Market LLC (Tape A and B)","New York Stock Exchange LLC","NYSE Arca, Inc.","BATS BZX Exchange, Inc.","BATS EDGX Exchange, Inc.",
          "NASDAQ OMX BX, Inc.","BATS EDGA Exchange, Inc.","BATS BYZ Exchange, Inc.")


is.labelOverlap <- function(data, vSpacing, var){
  
  # Checks whether there is enough space on the y-axis for the labels
  
  data <- data[order(-data[,var]), ]
  for(i in 2:length(data[,var])){
    if(data[i-1,var] - data[i,var] < vSpacing & !is.na(data[i,var])){
      return(TRUE)
    }
  }
  return(FALSE)
  
}
adjust.exchangeLabels <- function(data, vSpacing, var){
  
  labels <- data[order(-data[,var]), ]
  N = sum(!is.na(data[,var]))
  
  while(is.labelOverlap(labels, vSpacing, var)){
    for(i in 2:N){
      if(labels[i-1,var] - labels[i,var] < vSpacing){
        labels[i,var] <- labels[i,var] - vSpacing/2
        labels[i-1,var] <- labels[i-1,var] + vSpacing/2
      }
    }
  }
  return(labels)
}
plot.exchangeMarketShares <- function(data, interval = 'date', var = 'marketShare',
                                      myOutputFolder, figurePath = '', vertLines = FALSE,
                                      lineSize = NULL, myLineSize = 1.3,
                                      labelSpacing = 0.025){
  
  data$date <- data[,interval]
  data$marketShare <- data[,var]
  
  # Add some spacing at the end of the plot, proportional to the length of the period
  myRightSpacing = (max(data$date) - min(data$date)) * labelSpacing
  
  # Vertical spacing, proportional to the height of the plot (the largest y-value)
  vSpacing = max(na.omit(data$marketShare)) * labelSpacing
  
  # Plot, ticks for every year
  g <- ggplot(data, aes_string(x = interval, y = var, group = 'ex', colour = 'ex')) + 
    geom_line(size=myLineSize) +
    ylab('Weekly Market Share \n (Proportion of Shares Traded)') +
    xlab('') +
    theme(axis.text.x  = element_text(size = 22), axis.text.y = element_text(size = 22), axis.title.x = element_text(size = 22), 
          axis.title.y = element_text(size = 22)) +
    scale_x_date(breaks = "1 year", date_labels = "%Y", minor_breaks = "1 year", limits = c(min(data$date),max(data$date)+myRightSpacing)) +
    theme(legend.position="none")
  
  # Add labels for the exchanges
  labels <- data[data$date == max(data$date),]
  labels <- labels[which(!is.na(labels[,var])),]
  labels <- adjust.exchangeLabels(labels, vSpacing, var)
  g <- g + geom_text(data = labels, size = 6, aes_string(label = 'ex'), hjust = 0)
  ggsave(file=paste(myOutputFolder, figurePath ,'.png', sep=""),
         plot = g, width = 18, height=8,dpi=200)
  
}

######## setting up project path and downloading data

## Importing data including trading volume data for each date-exchange-tradeType pair
mydata <- read.csv(paste0(dataPath, "Ex_Weekly_Shares_2007_2018.csv"))

## Exchange abbr
mydata$week <- as.Date(as.character(mydata$week), "%Y%m%d")
mydata$ExchangeName = getEXname(mydata$ex)
mydata$ex   <- getAbbr(mydata$ex)

# ## plotting market shares from july 2007 to 2018. These are not currently in use.
# plot.exchangeMarketShares(mydata[mydata$week < '2018-07-01',],
#                           interval = 'week', var = 'Sh_S_Standard',
#                           myOutputFolder = paste0(projectPath,"/Figures/"), 
#                           figurePath = paste0("weekly_shares_2007_2018"))
# 
# ## plotting market shares from 2011 to 2018
# plot.exchangeMarketShares(mydata[mydata$week > '2010-12-31' & mydata$week < '2018-07-01',],
#                           interval = 'week', var = 'Sh_S_Standard',
#                           myOutputFolder = paste0(projectPath,"/Figures/"), 
#                           figurePath = paste0("weekly_shares_2011_2018"))

## plotting market shares from july 2007 to 2015
plot.exchangeMarketShares(mydata[mydata$week < '2016-01-03',],
                          interval = 'week', var = 'Sh_S_Standard',
                          myOutputFolder = paste0(projectPath,"Figures/"), 
                          figurePath = paste0("weekly_shares_2007_2015"))

## plotting market shares from 2011 to 2015
plot.exchangeMarketShares(mydata[mydata$week > '2010-12-31' & mydata$week < '2016-01-03',],
                          interval = 'week', var = 'Sh_S_Standard',
                          myOutputFolder = paste0(projectPath,"Figures/"), 
                          figurePath = paste0("weekly_shares_2011_2015"))

#################################################################
# Get total volume traded on each exchange in 2015,
# and calculate market shares.
# Rank the exchanges by market share in 2015.
# Determine which exchanges are top 5, top 8, and regionals
# Determine the market share of the top 5, 3 taker-maker exchanges, 
# and regionals.
###################################################################

# Keep the data from 2015
mydata2015 = mydata[mydata$week < '2016-01-03',]
mydata2015 = mydata2015[mydata2015$week > '2014-12-28',]
mydata2015 = data.table(mydata2015)

# Sum traded volume for all weeks in 2015 by exchange
mydata2015 = mydata2015[, .(ExName = ExchangeName[1], 
                            Vol_S_US_Equity = sum(Vol_S_US_Equity), 
                            Vol_D_US_Equity = sum(Vol_D_US_Equity), 
                            Vol_S_Standard = sum(Vol_S_Standard), 
                            Vol_D_Standard = sum(Vol_D_Standard)), by = c('ex')]

# Sum to total traded volume for all weeks (across exchanges)
mydata2015$Total_S_US_Equity = sum(mydata2015$Vol_S_US_Equity)
mydata2015$Total_D_US_Equity = sum(mydata2015$Vol_D_US_Equity)
mydata2015$Total_S_Standard = sum(mydata2015$Vol_S_Standard)
mydata2015$Total_D_Standard = sum(mydata2015$Vol_D_Standard)

# Generate volume traded as a share of total volume for each exchange
mydata2015$Sh_S_US_Equity = mydata2015$Vol_S_US_Equity/mydata2015$Total_S_US_Equity
mydata2015$Sh_D_US_Equity = mydata2015$Vol_D_US_Equity/mydata2015$Total_D_US_Equity 
mydata2015$Sh_S_Standard = mydata2015$Vol_S_Standard/mydata2015$Total_S_Standard
mydata2015$Sh_D_Standard = mydata2015$Vol_D_Standard/mydata2015$Total_D_Standard 

# Order the data in decreasing order of volume
mydata2015 = mydata2015[order(-mydata2015$Vol_S_US_Equity)]

#################################################################
# Generate output reported in text for market shares. 
# Use Sh_S_Standard. This is the regular hours, non-auction, 
# on-exchange, standard sale condition volume in shares.
###################################################################

# Drop  to the top 5 exchanges using the data order to get the total share for the top 5. 
top_5 = mydata2015[1:5]

# Confirm these are the main maker-takers
table(top_5$ex)

# Find the sum of their shares and calculate the percentage.
top_5 = top_5[, .(Sh_S_US_Equity = sum(Sh_S_US_Equity), 
                  Sh_D_US_Equity = sum(Sh_D_US_Equity), 
                  Sh_S_Standard = sum(Sh_S_Standard), 
                  Sh_D_Standard = sum(Sh_D_Standard))]
top_5$Sh_S_Standard*100

# Look at the taker-maker exchanges using their labels: EDGA, NASDAQ BX and BATS Y 
taker_maker = mydata2015[mydata2015$ex == 'EDGA' | mydata2015$ex == 'BX'| mydata2015$ex == 'BYX']
taker_maker = taker_maker[, .(Sh_S_US_Equity = sum(Sh_S_US_Equity), 
                              Sh_D_US_Equity = sum(Sh_D_US_Equity), 
                              Sh_S_Standard = sum(Sh_S_Standard), 
                              Sh_D_Standard = sum(Sh_D_Standard))]
taker_maker$Sh_S_Standard*100

# Drop the data to the remaining 4 regional exchanges with non-zero volume.
# Since these are not the top 5 nor the top 8.
# We look fro the max share among those exchanges to confirm that none
# has more than 2% of the volume.
regional = mydata2015[9:12]
regional = regional[, .(Sh_S_US_Equity = max(Sh_S_US_Equity), Sh_D_US_Equity = max(Sh_D_US_Equity), Sh_S_Standard = max(Sh_S_Standard), 
                        Sh_D_Standard = max(Sh_D_Standard))]
regional$Sh_S_Standard*100


######################################
# Top 100
# Calculate the 2015 Market Share for
# Each Exchange at the level of the 
# year using the top 100 stocks.
######################################
mydata_T100 = read.csv(paste0(dataPath, "Ex_Weekly_Shares_2015_T100.csv"))

## Exchange abbr
mydata_T100$week <- as.Date(as.character(mydata_T100$week), "%Y%m%d")
mydata_T100$ExchangeName = getEXname(mydata_T100$ex)
mydata_T100$ex   <- getAbbr(mydata_T100$ex)

### Market Shares for 2015
# Keep the data from 2015
mydata2015_T100 = mydata_T100[mydata_T100$week < '2016-01-03',]
mydata2015_T100 = mydata2015_T100[mydata2015_T100$week > '2014-12-28',]
mydata2015_T100 = data.table(mydata2015_T100)

# Sum traded volume for all weeks in 2015 by exchange
mydata2015_T100 = mydata2015_T100[, .(ExName = ExchangeName[1], 
                                      Vol_S_US_Equity = sum(Vol_S_US_Equity_T100),
                                      Vol_D_US_Equity = sum(Vol_D_US_Equity_T100), 
                                      Vol_S_Standard = sum(Vol_S_Standard_T100), 
                                      Vol_D_Standard = sum(Vol_D_Standard_T100)), by = c('ex')]

# Sum to total traded volume for all weeks (across exchanges)
mydata2015_T100$Total_S_US_Equity = sum(mydata2015_T100$Vol_S_US_Equity)
mydata2015_T100$Total_D_US_Equity = sum(mydata2015_T100$Vol_D_US_Equity)
mydata2015_T100$Total_S_Standard = sum(mydata2015_T100$Vol_S_Standard)
mydata2015_T100$Total_D_Standard = sum(mydata2015_T100$Vol_D_Standard)

# Generate volume traded as a share of total volume for each exchange
mydata2015_T100$Sh_S_US_Equity = mydata2015_T100$Vol_S_US_Equity/mydata2015_T100$Total_S_US_Equity
mydata2015_T100$Sh_D_US_Equity = mydata2015_T100$Vol_D_US_Equity/mydata2015_T100$Total_D_US_Equity 
mydata2015_T100$Sh_S_Standard = mydata2015_T100$Vol_S_Standard/mydata2015_T100$Total_S_Standard
mydata2015_T100$Sh_D_Standard = mydata2015_T100$Vol_D_Standard/mydata2015_T100$Total_D_Standard 

# Order the data in decreasing order of volume
mydata2015_T100 = mydata2015_T100[order(-mydata2015_T100$Vol_S_US_Equity)]

write.csv(mydata2015, paste0(projectPath,'Tables/Raw/VolumeShares_2015.csv', sep = ''))
write.csv(mydata2015_T100, paste0(projectPath,'Tables/Raw/VolumeShares_2015_T100.csv', sep = ''))

###############################################################
# Top 8 
# Find R2 from regressing share on exchange for 2011 - 2015,
# At the daily level. 
###############################################################
## Importing data including trading volume data for each date-exchange-trade pair
mydata <- read.csv(paste0(dataPath, "Ex_Daily_Shares_2007_2018.csv"))

## Exchange abbr
mydata$date <- as.Date(as.character(mydata$DATE), "%Y%m%d")
mydata$ExchangeName = getEXname(mydata$ex)
mydata$ex   <- getAbbr(mydata$ex)
regression_data = mydata[mydata$date > '2010-12-31' & mydata$date < '2016-01-03' & !is.na(mydata$Sh_S_Standard),]

## Top 8
reg_top8 <- regression_data[regression_data$ExchangeName %in% top8,]
  
# To match the lm procedure to the "aggregate" procedure, you need to surpress the intercept (add a 0 on the RHS)
# and then calculate r2 <- 1 - sum(r$resid^2)/ss_tot and calculate 
# ss_tot = sum((data[,y] - mean(data[,y]))^2). We use the means calculation, but leave 
# this one in place in case we later want to change the method.

## Calculate R2 for period descibed in paper [up through 2015]
r <- lm(Sh_S_Standard ~ 0 + ex,reg_top8 ,na.action = na.omit)
sm <- summary(r)
resid <- sm$residuals
dif_mean <- reg_top8$Sh_S_Standard - mean(reg_top8$Sh_S_Standard)
R2 = 1 - sum(resid^2)/sum(dif_mean^2)
R2

# Alternate way to calculate R2
group_vars.t <- aggregate(Sh_S_Standard~1,reg_top8, function(x) sum((x - mean(x))^2))
ss_tot       <- sum(group_vars.t[, 'Sh_S_Standard'], na.rm = TRUE)
group_vars <- aggregate(Sh_S_Standard~ex, reg_top8, function(x) sum((x - mean(x))^2))
ss_res     <- sum(group_vars[, 'Sh_S_Standard'], na.rm = TRUE)
1 -ss_res/ss_tot

######################################################
# Compare volume traded in the top 100 to full sample
######################################################

## Importing data including trading volume data for each symbol-exchange-date
mydata <- read.csv(paste0(dataPath, "Sym_Ex_Date_Volume_2015.csv"))

## Import the Symbol-Universe
data_symStats <- read.csv(paste0(dataPath, "Symbol_Universe_2015.csv"))
data_symStats <- data_symStats[c("SYM_ROOT","SYM_SUFFIX","LISTED_MARKET","f_Sample_1","f_Sample_1_T100","ADV_S_Standard","n_Price_Standard")]

## Merge the Symbol-Universe onto the symbol-exchange-date volume
mydata <- merge(mydata, data_symStats, by = c("SYM_ROOT","SYM_SUFFIX"))
mydata = data.table(mydata)

## Find the total volume for all symbols, all symbols in the sample and for the top 100 in the sample.
mydata$Vol_S_Standard = as.numeric(mydata$Vol_S_Standard)
Vol_All = sum(mydata$Vol_S_Standard)

mydata_Sample = mydata[mydata$f_Sample_1 == 1,]
mydata_Sample$Vol_S_Standard = as.numeric(mydata_Sample$Vol_S_Standard)
Vol_Sample = sum(mydata_Sample$Vol_S_Standard)

mydata_T100 = mydata[mydata$f_Sample_1_T100==1,]
Vol_T100 = sum(mydata_T100$Vol_S_Standard)

## Calculate and report the percentage
Vol_T100/Vol_All*100
Vol_T100/Vol_Sample*100